{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Hnswlib\n",
    "\n",
    "Hnswlib is a fast approximate nearest neighbor search index. It's a lightweight, header-only C++ HNSW implementation that has no dependencies other than C++11. Hnswlib provides python bindings."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install llama-index\n",
    "%pip install llama-index-vector-stores-hnswlib\n",
    "%pip install llama-index-embeddings-huggingface\n",
    "%pip install hnswlib"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Import package dependencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.vector_stores.hnswlib import HnswlibVectorStore\n",
    "from llama_index.core import (\n",
    "    VectorStoreIndex,\n",
    "    StorageContext,\n",
    "    SimpleDirectoryReader,\n",
    ")\n",
    "from llama_index.embeddings.huggingface import HuggingFaceEmbedding"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load example data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!mkdir -p 'data/paul_graham/'\n",
    "!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "documents = SimpleDirectoryReader(\"./data/paul_graham/\").load_data()\n",
    "print(f\"Total documents: {len(documents)}\")\n",
    "print(f\"First document, id: {documents[0].doc_id}\")\n",
    "print(f\"First document, hash: {documents[0].hash}\")\n",
    "print(\n",
    "    \"First document, text\"\n",
    "    f\" ({len(documents[0].text)} characters):\\n{'='*20}\\n{documents[0].text[:360]} ...\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load the embedding model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "embed_model = HuggingFaceEmbedding(\n",
    "    model_name=\"sentence-transformers/all-MiniLM-L6-v2\",\n",
    "    normalize=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create Hnswlib Vector Store object from Hnswlib.Index parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "hnswlib_vector_store = HnswlibVectorStore.from_params(\n",
    "    space=\"ip\",\n",
    "    dimension=embed_model._model.get_sentence_embedding_dimension(),\n",
    "    max_elements=1000,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Alternatively, You can create a Hnswlib.Index object Yourself."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import hnswlib\n",
    "\n",
    "index = hnswlib.Index(\n",
    "    \"ip\", embed_model._model.get_sentence_embedding_dimension()\n",
    ")\n",
    "index.init_index(max_elements=1000)\n",
    "\n",
    "hnswlib_vector_store = HnswlibVectorStore(index)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Build index from documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "hnswlib_storage_context = StorageContext.from_defaults(\n",
    "    vector_store=hnswlib_vector_store\n",
    ")\n",
    "hnswlib_index = VectorStoreIndex.from_documents(\n",
    "    documents,\n",
    "    storage_context=hnswlib_storage_context,\n",
    "    embed_model=embed_model,\n",
    "    show_progress=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Query index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k = 5\n",
    "query = \"Before college I wrote what begginers should write.\"\n",
    "hnswlib_vector_retriever = hnswlib_index.as_retriever(similarity_top_k=k)\n",
    "nodes_with_scores = nodes_with_scores = hnswlib_vector_retriever.retrieve(\n",
    "    query\n",
    ")\n",
    "for node in nodes_with_scores:\n",
    "    print(f\"Node {node.id_} | Score: {node.score:.3f} - {node.text[:120]}...\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llama-index-hd-gQzJx-py3.10",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
